library(ggplot2)
library(dplyr)
library(fst)
library(reshape2)
library(miceadds)

load("main_data.rdata")

# make vector of continuing municipalities: 

kom_cont <- 
  c("101", "147", "151", "153", "155", "157", "159", "161", 
    "163", "165", "167", "169", "173", "175", "183", "185", 
    "187", "201", "217", "223", "400", "253", "269", "329", 
    "461", "563", "607", "727", "741", "751", "773", "825")

#create function for splitting income residuals in brackets


var <- c("run_kv", "elected_kv")
lab <- c("Running for \nmunicipality", 
         "Elected for \nmunicipality")

data_out <- tibble()
data_model <- tibble()

for(k in seq(1993, 2013, by = 4)){
  
  data_an <- 
    data_comp %>% 
    filter(four_years == k) 
  
  bef <- 
    read.fst(paste("bef", k, ".fst", sep = ""))  
  
  edu <- 
    read.fst(paste("udda_recoded", k, ".fst", sep = ""))
  
  data_an <- 
    left_join(data_an, bef, by = "PNR") %>% 
    left_join(., edu, by = "PNR") # %>% sample_n(500000) # for fitting
  
  data_an <- 
    data_an %>% 
    mutate(edu_years = ifelse(education == "Grundskole", 9, 
                              ifelse(education == "Erhvervsfaglige praktik- og hovedforl?b", 13,
                                     ifelse(education == "Almengymnasiale uddannelser", 12, 
                                            ifelse(education == "Erhvervsgymnasiale uddannelser", 12, 
                                                   ifelse(education == "Korte videreg?ende uddannelser", 14, 
                                                          ifelse(education == "Mellemlange videreg?ende uddannelser", 15,  
                                                                 ifelse(education == "Bachelor", 15,  
                                                                        ifelse(education == "Lange videreg?ende uddannelser", 17,
                                                                               ifelse(education == "Forskeruddannelser", 20, NA))))))))))
  
  
  mun_comp <- 
    data_an %>% 
    group_by(KOM) %>% 
    summarise(kom_comp = mean(edu_years, na.rm = TRUE))
  
  data_an <- 
    left_join(data_an, mun_comp, by = "KOM")
  
  for(j in 1:2){
    data_merge <- 
      data_an %>% 
      filter(.[,paste(var[j], "_", k, sep = "")] == 1) %>%
      filter(!KOM %in% kom_cont) 
    
    data_cont <- 
      data_an %>% 
      filter(.[,paste(var[j], "_", k, sep = "")] == 1) %>%
      filter(KOM %in% kom_cont)
    
    data_out_temp <-
      data_frame(year = k,
                 lab  = lab[j],
                 mean_merge = mean(data_merge$edu_years, na.rm = TRUE),
                 mean_cont  = mean(data_cont$edu_years, na.rm = TRUE),
                 mean_merge_kom = mean(data_merge$edu_years - 
                                         data_merge$kom_comp, na.rm = TRUE),
                 mean_cont_kom  = mean(data_cont$edu_years  - 
                                         data_cont$kom_comp, na.rm = TRUE),
                 se_merge = sd(data_merge$edu_years, na.rm = TRUE)/sqrt(sum(!is.na(data_merge$edu_years))),
                 se_cont  = sd(data_cont$edu_years, na.rm = TRUE)/ sqrt(sum(!is.na(data_cont$edu_years))),
                 se_merge_kom = sd(data_merge$edu_years - 
                                         data_merge$kom_comp, na.rm = TRUE)/sqrt(sum(!is.na(data_merge$edu_years))),
                 se_cont_kom  = sd(data_cont$edu_years  - 
                                         data_cont$kom_comp, na.rm = TRUE)/sqrt(sum(!is.na(data_cont$edu_years))))
    
    data_out <- 
      bind_rows(data_out, data_out_temp)
    
    data_year_out <- 
      bind_rows(data_merge %>% 
                  select(c("PNR", "edu_years", "kom_comp", "KOM")) %>% 
                  mutate(year = k,
                         type = var[j], 
                         merg = 1),
                data_cont %>% 
                  select(c("PNR", "edu_years", "kom_comp", "KOM")) %>% 
                  mutate(year = k,
                         type = var[j], 
                         merg = 0))
    
    data_model <- 
      bind_rows(data_model, data_year_out)
  }
  print(k)
  print(Sys.time())
}

data_plot <- 
  melt(as.data.frame(data_out), 
       id = c("year", "lab"))


data_points <- 
  data_plot %>% 
  filter(variable == "mean_merge" | 
           variable == "mean_cont") %>% 
  mutate(estimate = value,
         variable = case_when(variable == "mean_merge" ~ "merg",
                              variable == "mean_cont" ~ "cont"))  %>%
  select(c("year", "lab", "variable", "estimate"))


data_ses <- 
  data_plot %>% 
  filter(variable == "se_merge" | 
           variable == "se_cont") %>% 
  mutate(se = value,
         variable = case_when(variable == "se_merge" ~ "merg",
                              variable == "se_cont" ~ "cont"))  %>%
  select(c("year", "lab", "variable", "se")) 

data_plot1 <- 
  left_join(data_points, data_ses) %>% 
  mutate(year = ifelse(variable == "merg", year - 0.2, year + 0.2))

data_points_kom <- 
  data_plot %>% 
  filter(variable == "mean_merge_kom" | 
           variable == "mean_cont_kom") %>% 
  mutate(estimate = value,
         variable = case_when(variable == "mean_merge_kom" ~ "merg",
                              variable == "mean_cont_kom" ~ "cont"))  %>%
  select(c("year", "lab", "variable", "estimate")) 

data_ses_kom <- 
  data_plot %>% 
  filter(variable == "se_merge_kom" | 
           variable == "se_cont_kom") %>% 
  mutate(se = value,
         variable = case_when(variable == "se_merge_kom" ~ "merg",
                              variable == "se_cont_kom" ~ "cont"))  %>%
  select(c("year", "lab", "variable", "se")) 

data_plot2 <- 
  left_join(data_points_kom, data_ses_kom) %>% 
  mutate(year = ifelse(variable == "merg", year - 0.2, year + 0.2))

plot1 <- 
  ggplot(data = data_plot1,
         aes(x = year,
             y = estimate,
             ymin = estimate + qnorm(0.025) * se,
             ymax = estimate + qnorm(0.975) * se,
             alpha = variable)) + 
  facet_grid(lab ~ .) +
  geom_errorbar(width = 0) + 
  geom_point() + 
  theme_classic() +
  scale_y_continuous("Years of education") + 
  scale_x_continuous("Year", breaks = seq(1993, 2013, 4)) +
  geom_vline(xintercept = 2003, linetype = "dashed", alpha = 0.5) + 
  scale_alpha_discrete("", range = c(0.25, 1), 
                       labels = c("Continuing \nmunicipalities",
                                  "Amalgamated \nmunicipalities")) +
  theme(panel.spacing = unit(2, "lines"))

plot2 <- 
  ggplot(data = data_plot2,
         aes(x = year,
             y = estimate,
             ymin = estimate + qnorm(0.025) * se,
             ymax = estimate + qnorm(0.975) * se,
             alpha = variable)) + 
  facet_grid(lab ~ .) +
  geom_errorbar(width = 0) + 
  geom_point() + 
  theme_classic() +
  scale_y_continuous("Years of education relative to municipality mean") + 
  scale_x_continuous("Year", breaks = seq(1993, 2013, 4)) +
  geom_vline(xintercept = 2003, linetype = "dashed", alpha = 0.5) + 
  scale_alpha_discrete("", range = c(0.25, 1), 
                       labels = c("Continuing \nmunicipalities",
                                  "Amalgamated \nmunicipalities")) +
  theme(panel.spacing = unit(2, "lines"))

# Find model estimates: 

ests_elected <- matrix(NA, nrow = 10, ncol = 4)
ests_running <- matrix(NA, nrow = 10, ncol = 4)

for(i in 1:4){
  k <- seq(2001, 2013, 4)[i]
  
  data_mod_elected <- 
    data_model %>% 
    filter((year == k - 8 | year == k) & type == "elected_kv") %>% 
    mutate(post = year == k,
           edu_years_kom = edu_years - kom_comp)
  
  mod_elected  <- lm.cluster(formula = edu_years ~ post*merg + as.factor(KOM), 
                             data = data_mod_elected, 
                             cluster = data_mod_elected$PNR)
  
  ests_elected[1:2, i] <- summary(mod_elected)[nrow(summary(mod_elected)), 1:2]
  ests_elected[  4, i] <- nrow(data_mod_elected)
  ests_elected[  5, i] <- length(unique(data_mod_elected$PNR))
  
  # repeat for models controlled for kom level
  
  mod_elected <- lm.cluster(formula = edu_years_kom ~ post*merg + as.factor(KOM), 
                            data = data_mod_elected, 
                            cluster = data_mod_elected$PNR)
  
  ests_elected[6:7, i] <- summary(mod_elected)[nrow(summary(mod_elected)), 1:2]
  ests_elected[  9, i] <- nrow(data_mod_elected)
  ests_elected[ 10, i] <- length(unique(data_mod_elected$PNR))
  
  # repeat for running
  
  data_mod_running <- 
    data_model %>% 
    filter((year == k - 8 | year == k) & type == "run_kv") %>% 
    mutate(post = year == k,
           edu_years_kom = edu_years - kom_comp)
  
  mod_running <- lm.cluster(formula = edu_years ~ post*merg + as.factor(KOM), 
                            data = data_mod_running, 
                            cluster = data_mod_running$PNR)
  
  ests_running[1:2, i] <- summary(mod_elected)[nrow(summary(mod_elected)), 1:2]
  ests_running[  4, i] <- nrow(data_mod_running)
  ests_running[  5, i] <- length(unique(data_mod_running$PNR))
  
  # models controlled for kom level
  
  mod_running <- lm.cluster(formula = edu_years_kom ~ post*merg + as.factor(KOM), 
                            data = data_mod_running, 
                            cluster = data_mod_running$PNR)
  
  ests_running[6:7, i] <- summary(mod_running)[nrow(summary(mod_elected)), 1:2]
  ests_running[  9, i] <- nrow(data_mod_running)
  ests_running[ 10, i] <- length(unique(data_mod_running$PNR))
}